#!/usr/bin/python
#coding:utf-8
import urllib.request
import re
import chardet   #需要导入这个模块，检测编码格式

def download(url,user_agent='wswp',num_reload=5):
    """Fetch *url* and return its body decoded to text.

    Args:
        url: Address to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        num_reload: How many more times to retry when the server answers
            with a 5xx status.

    Returns:
        The decoded body as ``str``, or ``None`` when the request failed
        and no retry succeeded.
    """
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request) as response:
            raw = response.read()
    except urllib.request.URLError as e:
        print('Downloading error:', e.reason)
        # Only 5xx responses are worth retrying; plain URLError instances
        # (e.g. connection problems) carry no ``code`` attribute.
        if num_reload > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_reload - 1)
        # Report the failure instead of falling through to the decoder
        # with nothing to decode.
        return None
    # chardet reports ``None`` when it cannot guess (e.g. an empty body);
    # fall back to UTF-8 and replace undecodable bytes so one bad page
    # does not abort the crawl.
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    return raw.decode(encoding, errors='replace')

def crawl_sitemap(url):
    """Download the sitemap at *url* and then download every page it lists.

    The sitemap text and each extracted link are printed as they are
    processed. Nothing is returned.

    Args:
        url: Address of the sitemap file.
    """
    sitemap = download(url)  # fetch the sitemap document
    print(sitemap)
    if not sitemap:
        # Nothing was downloaded, so there are no links to extract.
        return
    # Pull out the contents of every <loc>...</loc> element.
    links = re.findall(r'<loc>(.*?)</loc>', sitemap)
    for link in links:
        print(link)
        download(link)

# Top-level call: crawls the example site's sitemap whenever this file is executed or imported.
crawl_sitemap('http://example.webscraping.com/sitemap.xml')